Contents

def insert_data(conn, csv_file):
    """Load rows from the Life Expectancy CSV into Country and YearlyData.

    Parameters
    ----------
    conn : sqlite3.Connection
        Open connection to a database that already contains the
        Country(CountryID, CountryName, Status) and YearlyData tables.
    csv_file : str
        Path to the CSV file. Headers must match the raw Kaggle dataset,
        including the odd leading/trailing spaces (e.g. 'Life expectancy ').

    Each row is committed individually so one bad row does not discard the
    rows already inserted; failed rows are reported and rolled back.
    """
    cursor = conn.cursor()

    def _opt_int(value):
        # Empty CSV cells mean "missing" -> store NULL.
        return int(value) if value else None

    def _opt_float(value):
        return float(value) if value else None

    # Open and parse CSV file
    with open(csv_file, 'r') as file:
        reader = csv.DictReader(file)
        for row in reader:
            try:
                # Insert into Country table; IGNORE keeps an existing row.
                cursor.execute("""
                    INSERT OR IGNORE INTO Country (CountryName, Status)
                    VALUES (?, ?)
                """, (row['Country'], row['Status']))

                # Get CountryID for the foreign key reference.
                cursor.execute("SELECT CountryID FROM Country WHERE CountryName = ?", (row['Country'],))
                result = cursor.fetchone()
                if result is None:
                    # Guard against fetchone() returning None, which would
                    # previously raise an unhandled TypeError on [0].
                    raise sqlite3.IntegrityError(
                        f"CountryID not found for {row['Country']!r}")
                country_id = result[0]

                # Insert into YearlyData table
                cursor.execute("""
                    INSERT INTO YearlyData (
                        CountryID, Year, LifeExpectancy, AdultMortality, InfantDeaths, Alcohol,
                        PercentageExpenditure, HepatitisB, Measles, BMI, UnderFiveDeaths, Polio,
                        TotalExpenditure, Diphtheria, HIV_AIDS, GDP, Population, Thinness_1_19,
                        Thinness_5_9, IncomeComposition, Schooling
                    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """, (
                    country_id,
                    int(row['Year']),
                    _opt_float(row['Life expectancy ']),
                    _opt_int(row['Adult Mortality']),
                    _opt_int(row['infant deaths']),
                    _opt_float(row['Alcohol']),
                    _opt_float(row['percentage expenditure']),
                    _opt_int(row['Hepatitis B']),
                    _opt_int(row['Measles ']),
                    _opt_float(row[' BMI ']),
                    _opt_int(row['under-five deaths ']),
                    _opt_int(row['Polio']),
                    _opt_float(row['Total expenditure']),
                    _opt_int(row['Diphtheria ']),
                    _opt_float(row[' HIV/AIDS']),
                    _opt_float(row['GDP']),
                    _opt_float(row['Population']),
                    _opt_float(row[' thinness  1-19 years']),
                    _opt_float(row[' thinness 5-9 years']),
                    _opt_float(row['Income composition of resources']),
                    _opt_float(row['Schooling'])
                ))

                # Commit after each successful insert to release the lock quickly
                conn.commit()

            except (sqlite3.Error, KeyError, ValueError) as e:
                # FIX: KeyError (missing column) and ValueError (non-numeric
                # cell) previously escaped the sqlite3.Error-only handler and
                # aborted the entire load without a rollback.
                print(f"Error: {e}")
                conn.rollback()  # Rollback in case of an error

    # Close cursor
    cursor.close()
import sqlite3
import pandas as pd

def fetch_data_to_dataframe(db_path="life_expectancy.db"):
    """Read the joined Country/YearlyData rows into a typed DataFrame.

    Parameters
    ----------
    db_path : str, optional
        Path to the SQLite database file. Defaults to "life_expectancy.db",
        preserving the original hardcoded behavior for existing callers.

    Returns
    -------
    pandas.DataFrame
        Columns: Country, Status, Year, LifeExpectancy, AdultMortality,
        InfantDeaths, AlcoholConsumption. Numeric columns are coerced
        (unparseable values become NaN).
    """
    conn = sqlite3.connect(db_path)
    try:
        # SQL query without explicit casting
        query = """
            SELECT 
                c.CountryName AS Country, 
                c.Status AS Status, 
                y.Year AS Year, 
                y.LifeExpectancy AS LifeExpectancy, 
                y.AdultMortality AS AdultMortality, 
                y.InfantDeaths AS InfantDeaths, 
                y.Alcohol AS AlcoholConsumption
            FROM 
                YearlyData y
            JOIN 
                Country c ON y.CountryID = c.CountryID;
        """

        # Execute query and load into Pandas DataFrame
        df = pd.read_sql_query(query, conn)
    finally:
        # FIX: the connection previously leaked whenever read_sql_query
        # raised (e.g. "no such table: YearlyData", as the notebook's own
        # traceback shows) because conn.close() was only on the happy path.
        conn.close()

    # Convert columns to appropriate data types
    df['Year'] = df['Year'].astype(int)
    df['LifeExpectancy'] = pd.to_numeric(df['LifeExpectancy'], errors='coerce')
    df['AdultMortality'] = pd.to_numeric(df['AdultMortality'], errors='coerce', downcast='integer')
    df['InfantDeaths'] = pd.to_numeric(df['InfantDeaths'], errors='coerce', downcast='integer')
    df['AlcoholConsumption'] = pd.to_numeric(df['AlcoholConsumption'], errors='coerce')

    return df

# Fetch data and display the DataFrame
df = fetch_data_to_dataframe()  # reads the SQLite database path configured inside the function
print(df.dtypes)  # Verify column data types
print(df.head())  # Spot-check the first few joined rows
---------------------------------------------------------------------------
OperationalError                          Traceback (most recent call last)
File /opt/anaconda3/lib/python3.12/site-packages/pandas/io/sql.py:2674, in SQLiteDatabase.execute(self, sql, params)
   2673 try:
-> 2674     cur.execute(sql, *args)
   2675     return cur

OperationalError: no such table: YearlyData

The above exception was the direct cause of the following exception:

DatabaseError                             Traceback (most recent call last)
Cell In[2], line 39
     36     return df
     38 # Fetch data and display the DataFrame
---> 39 df = fetch_data_to_dataframe()
     40 print(df.dtypes)  # Verify column data types
     41 print(df.head())

Cell In[2], line 25, in fetch_data_to_dataframe()
      9 query = """
     10     SELECT 
     11         c.CountryName AS Country, 
   (...)
     21         Country c ON y.CountryID = c.CountryID;
     22 """
     24 # Execute query and load into Pandas DataFrame
---> 25 df = pd.read_sql_query(query, conn)
     27 conn.close()
     29 # Convert columns to appropriate data types

File /opt/anaconda3/lib/python3.12/site-packages/pandas/io/sql.py:526, in read_sql_query(sql, con, index_col, coerce_float, params, parse_dates, chunksize, dtype, dtype_backend)
    523 assert dtype_backend is not lib.no_default
    525 with pandasSQL_builder(con) as pandas_sql:
--> 526     return pandas_sql.read_query(
    527         sql,
    528         index_col=index_col,
    529         params=params,
    530         coerce_float=coerce_float,
    531         parse_dates=parse_dates,
    532         chunksize=chunksize,
    533         dtype=dtype,
    534         dtype_backend=dtype_backend,
    535     )

File /opt/anaconda3/lib/python3.12/site-packages/pandas/io/sql.py:2738, in SQLiteDatabase.read_query(self, sql, index_col, coerce_float, parse_dates, params, chunksize, dtype, dtype_backend)
   2727 def read_query(
   2728     self,
   2729     sql,
   (...)
   2736     dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
   2737 ) -> DataFrame | Iterator[DataFrame]:
-> 2738     cursor = self.execute(sql, params)
   2739     columns = [col_desc[0] for col_desc in cursor.description]
   2741     if chunksize is not None:

File /opt/anaconda3/lib/python3.12/site-packages/pandas/io/sql.py:2686, in SQLiteDatabase.execute(self, sql, params)
   2683     raise ex from inner_exc
   2685 ex = DatabaseError(f"Execution failed on sql '{sql}': {exc}")
-> 2686 raise ex from exc

DatabaseError: Execution failed on sql '
        SELECT 
            c.CountryName AS Country, 
            c.Status AS Status, 
            y.Year AS Year, 
            y.LifeExpectancy AS LifeExpectancy, 
            y.AdultMortality AS AdultMortality, 
            y.InfantDeaths AS InfantDeaths, 
            y.Alcohol AS AlcoholConsumption
        FROM 
            YearlyData y
        JOIN 
            Country c ON y.CountryID = c.CountryID;
    ': no such table: YearlyData
print(df.dtypes)
Country               object
Status                object
Year                   int32
LifeExpectancy         int64
AdultMortality          int8
InfantDeaths            int8
AlcoholConsumption     int64
dtype: object
import pandas as pd

# Read the raw dataset from disk.
df = pd.read_csv("Life Expectancy Data.csv")

# Stratification relies on 'Status', so fail fast if it is absent.
if 'Status' not in df.columns:
    raise ValueError("'Status' column is missing from the dataset.")

# Rows without a 'Status' value cannot be stratified -- discard them.
df = df.dropna(subset=['Status'])

# A stratified split needs at least two distinct classes.
if df['Status'].nunique() < 2:
    raise ValueError("Stratification not possible: Only one class present in 'Status'.")

# Features keep 'Status' (it drives the stratification below); the target
# is the raw life-expectancy column (note the trailing space in its name).
X = df.drop(columns=['Life expectancy ', 'Country'])
y = df['Life expectancy ']

from sklearn.model_selection import train_test_split

# 80/20 split, stratified on the development status of each country.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=X['Status']
)

# Show that both splits preserve the class proportions.
for header, frame in (("Train set distribution:", X_train), ("\nTest set distribution:", X_test)):
    print(header)
    print(frame['Status'].value_counts(normalize=True))
Train set distribution:
Status
Developing    0.825532
Developed     0.174468
Name: proportion, dtype: float64

Test set distribution:
Status
Developing    0.826531
Developed     0.173469
Name: proportion, dtype: float64
import pandas as pd
from ydata_profiling import ProfileReport

# Read the raw dataset from disk.
df = pd.read_csv("Life Expectancy Data.csv")

# Build an exploratory (in-depth) profiling report over every column.
report = ProfileReport(df, title="Life Expectancy Data Profile", explorative=True)

# Persist the report as a standalone HTML page...
report.to_file("life_expectancy_profile_report.html")

# ...and also embed it inline when running inside a Jupyter notebook.
report.to_notebook_iframe()
import seaborn as sns
import matplotlib.pyplot as plt

# Correlations are only defined for numeric data, so filter columns first.
numeric_part = df.select_dtypes(include=['float64', 'int64'])
corr = numeric_part.corr()

# Render the correlation matrix as an annotated heatmap.
plt.figure(figsize=(12, 8))
sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", cbar=True)
plt.title("Correlation Matrix")
plt.show()
C:\Users\sahil\AppData\Local\Temp\ipykernel_11976\1327099509.py:14: UserWarning: FigureCanvasAgg is non-interactive, and thus cannot be shown
  plt.show()
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd

# The country name is an identifier, not a predictive feature.
df_cleaned = df.drop(columns=["Country"])

# Fill numeric gaps with the per-column median (robust to outliers).
numerical_columns = df_cleaned.select_dtypes(include=["float64", "int64"]).columns
median_imputer = SimpleImputer(strategy="median")
df_cleaned[numerical_columns] = median_imputer.fit_transform(df_cleaned[numerical_columns])

# One-hot encode 'Status', dropping the first level to avoid collinearity.
# (sparse_output is the modern name of the old `sparse` keyword.)
one_hot = OneHotEncoder(sparse_output=False, drop="first")
status_matrix = one_hot.fit_transform(df_cleaned[["Status"]])
status_frame = pd.DataFrame(status_matrix, columns=one_hot.get_feature_names_out(["Status"]))
df_cleaned = pd.concat([df_cleaned.reset_index(drop=True), status_frame], axis=1).drop(columns=["Status"])

# Standardize the numeric features to zero mean / unit variance.
scaler = StandardScaler()
df_cleaned[numerical_columns] = scaler.fit_transform(df_cleaned[numerical_columns])

print("Cleaned Dataset:")
print(df_cleaned.head())
Cleaned Dataset:
       Year  Life expectancy   Adult Mortality  infant deaths   Alcohol  \
0  1.621762         -0.445408         0.792119       0.268824 -1.156989   
1  1.404986         -0.981827         0.856601       0.285786 -1.156989   
2  1.188210         -0.981827         0.832421       0.302749 -1.156989   
3  0.971434         -1.023899         0.864662       0.328193 -1.156989   
4  0.754658         -1.055453         0.888843       0.345155 -1.156989   

   percentage expenditure  Hepatitis B  Measles       BMI   \
0               -0.335570    -0.783807 -0.110384 -0.967349   
1               -0.334441    -0.914281 -0.168124 -0.992434   
2               -0.334594    -0.827298 -0.173531 -1.017519   
3               -0.332096    -0.696824  0.032045 -1.042605   
4               -0.367862    -0.653333  0.051757 -1.062673   

   under-five deaths   ...  Total expenditure  Diphtheria    HIV/AIDS  \
0            0.255359  ...           0.931485    -0.735391  -0.323445   
1            0.274060  ...           0.939818    -0.862233  -0.323445   
2            0.292761  ...           0.918987    -0.777671  -0.323445   
3            0.317696  ...           1.081463    -0.650830  -0.323445   
4            0.342631  ...           0.810670    -0.608549  -0.323445   

        GDP  Population   thinness  1-19 years   thinness 5-9 years  \
0 -0.453371    0.435183               2.815209             2.775386   
1 -0.451232   -0.183349               2.883439             2.819978   
2 -0.449799    0.398066               2.928926             2.864570   
3 -0.446924   -0.120968               2.974413             2.931458   
4 -0.492539   -0.134268               3.042643             2.976051   

   Income composition of resources  Schooling  Status_Developing  
0                        -0.737973  -0.585017                1.0  
1                        -0.752600  -0.615649                1.0  
2                        -0.781853  -0.646281                1.0  
3                        -0.815982  -0.676912                1.0  
4                        -0.859862  -0.768808                1.0  

[5 rows x 21 columns]
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.impute import SimpleImputer
import mlflow
import mlflow.sklearn
from dagshub import DAGsHubLogger

# Set DagsHub credentials (replace with your actual credentials)
# NOTE(security): a real access token is hardcoded below in plain text --
# it should be revoked and loaded from the environment/secret store instead.
os.environ["MLFLOW_TRACKING_USERNAME"] = "sahilsubhasbhaivachhani"
os.environ["MLFLOW_TRACKING_PASSWORD"] = "379f3e12fb8b9aec32dc4a2b385747be484b3a27"

# Configure MLflow to log to DagsHub
mlflow.set_tracking_uri("https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow")  # Replace <username> and <repo>

# Load the dataset
df = pd.read_csv("Life Expectancy Data.csv")

# Define features and target variable: binary label = life expectancy
# above the dataset median (1) or not (0).
X = df.drop(columns=["Life expectancy ", "Country"])  # Drop target and unrelated columns
y = (df["Life expectancy "] > df["Life expectancy "].median()).astype(int)  # Binary classification based on median

# Split data into training and testing sets (stratified on the label so
# both splits keep the same above/below-median balance)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Preprocessing pipeline for numerical columns (StandardScaler + Log Transformation)
numerical_columns = X_train.select_dtypes(include=["float64", "int64"]).columns.tolist()
numerical_preprocessor = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("log_transform", FunctionTransformer(np.log1p)),  # Log transformation: log(1 + x)
    ("scaler", StandardScaler())
])

# Preprocessing pipeline for categorical columns (OneHotEncoding)
categorical_columns = X_train.select_dtypes(include=["object"]).columns.tolist()
categorical_preprocessor = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Combine preprocessors into a column transformer (numeric and categorical
# branches run side by side on their respective columns)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_preprocessor, numerical_columns),
        ("cat", categorical_preprocessor, categorical_columns)
    ]
)

# Create the full pipeline with Logistic Regression
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("scaler", MinMaxScaler()),  # Apply MinMaxScaler after preprocessing
    ("classifier", LogisticRegression(max_iter=1000))
])

# Custom evaluation helper used for test-set logging
def log_metrics(estimator, X_val, y_val):
    """Return the F1 score and confusion-matrix cells (tn/fp/fn/tp) of *estimator* on a validation split."""
    predictions = estimator.predict(X_val)
    tn, fp, fn, tp = confusion_matrix(y_val, predictions).ravel()
    return {"f1_score": f1_score(y_val, predictions), "tn": tn, "fp": fp, "fn": fn, "tp": tp}

# Cross-validation with 3/10 folds and hyperparameter tuning using GridSearchCV
param_grid = {
    "classifier__C": [0.01, 0.1, 1.0],  # Regularization strength in Logistic Regression
    "classifier__penalty": ["l2"]       # Regularization type (L2 regularization)
}
cv = StratifiedKFold(n_splits=10)  # Change to 3 for 3 folds if needed

# Grid search over the pipeline defined above; refits the best F1 combo
# on the whole training split.
grid_search = GridSearchCV(pipeline, param_grid=param_grid,
                           scoring="f1", cv=cv,
                           return_train_score=True)

# Initialize DagsHub logger outside of a context manager
logger = DAGsHubLogger()

# Check if there is an active MLflow run and end it if necessary
# (a run left open by a previous cell would make start_run() fail).
if mlflow.active_run():
    mlflow.end_run()

# Start MLflow run manually
mlflow.start_run()

try:
    # Fit the model with GridSearchCV
    grid_search.fit(X_train, y_train)

    # Log hyperparameters and best score to MLflow/DagsHub
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    
    mlflow.log_params(best_params)
    mlflow.log_metric("best_cv_f1_score", best_score)

    logger.log_hyperparams(best_params)
    logger.log_metrics({"best_cv_f1_score": best_score})

    # Evaluate on the test set and log metrics to MLflow/DagsHub
    # (log_metrics defined earlier returns F1 + confusion-matrix cells).
    test_metrics = log_metrics(grid_search.best_estimator_, X_test, y_test)
    
    mlflow.log_metrics(test_metrics)
    logger.log_metrics(test_metrics)

    print("Best Parameters:", best_params)
    print("Best Cross-Validation F1 Score:", best_score)
    print("Test Metrics:", test_metrics)

finally:
    # End MLflow run manually to ensure proper closure even if an error occurs
    mlflow.end_run()
๐Ÿƒ View run adorable-boar-730 at: https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow/#/experiments/5/runs/05e09f74f0324e57a34f300d81ff45c6
๐Ÿงช View experiment at: https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow/#/experiments/5
Best Parameters: {'classifier__C': 1.0, 'classifier__penalty': 'l2'}
Best Cross-Validation F1 Score: 0.8498685626116809
Test Metrics: {'f1_score': 0.848585690515807, 'tn': 242, 'fp': 55, 'fn': 36, 'tp': 255}
๐Ÿƒ View run unequaled-bird-53 at: https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow/#/experiments/5/runs/6193fa47e6cb49f88c7ef1f2686e0727
๐Ÿงช View experiment at: https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow/#/experiments/5
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score, confusion_matrix, make_scorer
import mlflow
import mlflow.sklearn
from dagshub import DAGsHubLogger

# Set DagsHub credentials (replace with your actual credentials)
# NOTE(security): a real access token is hardcoded below in plain text --
# it should be revoked and loaded from the environment/secret store instead.
os.environ["MLFLOW_TRACKING_USERNAME"] = "sahilsubhasbhaivachhani"
os.environ["MLFLOW_TRACKING_PASSWORD"] = "379f3e12fb8b9aec32dc4a2b385747be484b3a27"

# Configure MLflow to log to DagsHub
mlflow.set_tracking_uri("https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow")

# Load the dataset
df = pd.read_csv("Life Expectancy Data.csv")

# Define features and target variable: binary label = life expectancy
# above the dataset median (1) or not (0).
X = df.drop(columns=["Life expectancy ", "Country"])  # Drop target and unrelated columns
y = (df["Life expectancy "] > df["Life expectancy "].median()).astype(int)  # Binary classification based on median

# Split data into training and testing sets, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Preprocessing pipeline for numerical columns (impute -> log1p -> standardize)
numerical_columns = X_train.select_dtypes(include=["float64", "int64"]).columns.tolist()
numerical_preprocessor = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("log_transform", FunctionTransformer(np.log1p)),  # Log transformation: log(1 + x)
    ("scaler", StandardScaler())
])

# Preprocessing pipeline for categorical columns (impute -> one-hot)
categorical_columns = X_train.select_dtypes(include=["object"]).columns.tolist()
categorical_preprocessor = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Combine preprocessors into a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_preprocessor, numerical_columns),
        ("cat", categorical_preprocessor, categorical_columns)
    ]
)

# List of classifiers to evaluate
classifiers = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RidgeClassifier": RidgeClassifier(),
    "RandomForestClassifier": RandomForestClassifier(n_estimators=100),
    # FIX: dropped use_label_encoder=False -- modern XGBoost ignores it and
    # emitted 'Parameters: { "use_label_encoder" } are not used' warnings
    # on every CV fold (visible in the cell output).
    "XGBClassifier": XGBClassifier(eval_metric='logloss')
}

# Initialize DagsHub logger outside of a context manager
logger = DAGsHubLogger()

# Close any run left open by a previous cell so start_run() won't fail
if mlflow.active_run():
    mlflow.end_run()

# One MLflow run per classifier. The `with` block already ends the run on
# both success and error, so the previous redundant try/finally that called
# mlflow.end_run() by hand has been removed.
for clf_name, clf in classifiers.items():
    with mlflow.start_run(run_name=clf_name):
        # Full pipeline: column-wise preprocessing, global MinMax scaling,
        # then the candidate classifier.
        pipeline = Pipeline(steps=[
            ("preprocessor", preprocessor),
            ("scaler", MinMaxScaler()),  # Apply MinMaxScaler after preprocessing
            ("classifier", clf)
        ])

        # 10-fold stratified cross-validation on the training split
        cv = StratifiedKFold(n_splits=10)
        f1_scorer = make_scorer(f1_score)
        cv_scores = cross_val_score(pipeline, X_train, y_train, scoring=f1_scorer, cv=cv)

        # Refit on the entire training set for the held-out evaluation
        pipeline.fit(X_train, y_train)

        # Test-set metrics: F1 plus the four confusion-matrix cells
        y_pred = pipeline.predict(X_test)
        f1 = f1_score(y_test, y_pred)
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

        # Log hyperparameters and metrics to MLflow/DagsHub
        mlflow.log_param("classifier", clf_name)
        mlflow.log_metric("mean_cv_f1_score", np.mean(cv_scores))
        mlflow.log_metric("std_cv_f1_score", np.std(cv_scores))
        mlflow.log_metric("test_f1_score", f1)
        mlflow.log_metric("tn", tn)
        mlflow.log_metric("fp", fp)
        mlflow.log_metric("fn", fn)
        mlflow.log_metric("tp", tp)

        logger.log_hyperparams({"classifier": clf_name})
        logger.log_metrics({
            "mean_cv_f1_score": np.mean(cv_scores),
            "std_cv_f1_score": np.std(cv_scores),
            "test_f1_score": f1,
            "tn": tn,
            "fp": fp,
            "fn": fn,
            "tp": tp,
        })

        print(f"Classifier: {clf_name}")
        print(f"Mean CV F1-Score: {np.mean(cv_scores):.4f}")
        print(f"Test F1-Score: {f1:.4f}")
        print(f"Confusion Matrix: TN={tn}, FP={fp}, FN={fn}, TP={tp}")
Classifier: LogisticRegression
Mean CV F1-Score: 0.8499
Test F1-Score: 0.8486
Confusion Matrix: TN=242, FP=55, FN=36, TP=255
๐Ÿƒ View run LogisticRegression at: https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow/#/experiments/5/runs/2af4119bd0ef438087f7249714a4e5b1
๐Ÿงช View experiment at: https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow/#/experiments/5
Classifier: RidgeClassifier
Mean CV F1-Score: 0.8479
Test F1-Score: 0.8498
Confusion Matrix: TN=232, FP=65, FN=28, TP=263
๐Ÿƒ View run RidgeClassifier at: https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow/#/experiments/5/runs/93267615d31e43bca6f95348e9f4242c
๐Ÿงช View experiment at: https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow/#/experiments/5
Classifier: RandomForestClassifier
Mean CV F1-Score: 0.9437
Test F1-Score: 0.9543
Confusion Matrix: TN=279, FP=18, FN=9, TP=282
๐Ÿƒ View run RandomForestClassifier at: https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow/#/experiments/5/runs/57721e18d26d43d2a3f53e4ef13d85e2
๐Ÿงช View experiment at: https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow/#/experiments/5
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:01:13] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:01:13] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:01:13] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:01:14] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:01:14] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:01:14] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:01:14] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:01:14] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:01:14] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:01:14] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:01:14] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
Classifier: XGBClassifier
Mean CV F1-Score: 0.9422
Test F1-Score: 0.9577
Confusion Matrix: TN=280, FP=17, FN=8, TP=283
๐Ÿƒ View run XGBClassifier at: https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow/#/experiments/5/runs/f3d3381fc85c4b54adadfad92a9a142e
๐Ÿงช View experiment at: https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow/#/experiments/5
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score, confusion_matrix, make_scorer
import mlflow
import mlflow.sklearn
from dagshub import DAGsHubLogger

# SECURITY NOTE(review): a live DagsHub token is hardcoded and committed to source.
# Rotate this token and load credentials from the environment / a secrets store
# instead of assigning them here.
os.environ["MLFLOW_TRACKING_USERNAME"] = "sahilsubhasbhaivachhani"
os.environ["MLFLOW_TRACKING_PASSWORD"] = "379f3e12fb8b9aec32dc4a2b385747be484b3a27"

# Configure MLflow to log to DagsHub
mlflow.set_tracking_uri("https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow")

# Load the dataset (column names in this CSV contain trailing/leading spaces,
# e.g. "Life expectancy " and " BMI " — keep them verbatim).
df = pd.read_csv("Life Expectancy Data.csv")

# Feature Engineering: derive ratio/interaction features from existing columns.
# The tiny epsilon guards against division by zero for GDP/deaths/population of 0.
df['Alcohol_GDP_Ratio'] = df['Alcohol'] / (df['GDP'] + 1e-6)
df['Mortality_Ratio'] = df['Adult Mortality'] / (df['infant deaths'] + 1e-6)
df['Expenditure_Per_Capita'] = df['percentage expenditure'] / (df['Population'] + 1e-6)
df['BMI_Schooling_Interaction'] = df[' BMI '] * df['Schooling']

# Define features and binary target: 1 = life expectancy above the median.
X = df.drop(columns=["Life expectancy ", "Country"])  # Drop target and identifier column
y = (df["Life expectancy "] > df["Life expectancy "].median()).astype(int)

# Stratified split so both classes keep their proportions in train and test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Preprocessing pipeline for numerical columns: impute -> log(1+x) -> standardize.
numerical_columns = X_train.select_dtypes(include=["float64", "int64"]).columns.tolist()
numerical_preprocessor = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("log_transform", FunctionTransformer(np.log1p)),  # Log transformation: log(1 + x)
    ("scaler", StandardScaler())
])

# Preprocessing pipeline for categorical columns: impute mode -> one-hot encode.
categorical_columns = X_train.select_dtypes(include=["object"]).columns.tolist()
categorical_preprocessor = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Combine preprocessors into a single column transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_preprocessor, numerical_columns),
        ("cat", categorical_preprocessor, categorical_columns)
    ]
)

# Classifiers to evaluate after feature engineering.
# NOTE: `use_label_encoder` was removed from XGBoost and only produced
# "Parameters: { use_label_encoder } are not used" warnings — dropped.
classifiers = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForestClassifier": RandomForestClassifier(n_estimators=100),
    "XGBClassifier": XGBClassifier(eval_metric='logloss')
}

# Initialize DagsHub logger once for all runs.
logger = DAGsHubLogger()

# End any dangling MLflow run left over from a previous notebook cell.
if mlflow.active_run():
    mlflow.end_run()

# One MLflow run per classifier. The `with` context manager already ends the
# run on exit (including on exceptions), so no manual try/finally is needed.
for clf_name, clf in classifiers.items():
    with mlflow.start_run(run_name=f"{clf_name}_Feature_Engineering"):
        # Full pipeline: column-wise preprocessing, then a global MinMax rescale,
        # then the classifier.
        pipeline = Pipeline(steps=[
            ("preprocessor", preprocessor),
            ("scaler", MinMaxScaler()),
            ("classifier", clf)
        ])

        # 10-fold stratified cross-validation on the training set, scored by F1.
        cv = StratifiedKFold(n_splits=10)
        f1_scorer = make_scorer(f1_score)
        cv_scores = cross_val_score(pipeline, X_train, y_train, scoring=f1_scorer, cv=cv)

        # Refit on the full training set for the held-out test evaluation.
        pipeline.fit(X_train, y_train)

        y_pred = pipeline.predict(X_test)
        f1 = f1_score(y_test, y_pred)
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

        # Log hyperparameters and metrics to MLflow/DagsHub.
        mlflow.log_param("classifier", clf_name)
        mlflow.log_metric("mean_cv_f1_score", np.mean(cv_scores))
        mlflow.log_metric("std_cv_f1_score", np.std(cv_scores))
        mlflow.log_metric("test_f1_score", f1)
        mlflow.log_metric("tn", tn)
        mlflow.log_metric("fp", fp)
        mlflow.log_metric("fn", fn)
        mlflow.log_metric("tp", tp)

        logger.log_hyperparams({"classifier": clf_name})
        logger.log_metrics({
            "mean_cv_f1_score": np.mean(cv_scores),
            "std_cv_f1_score": np.std(cv_scores),
            "test_f1_score": f1,
            "tn": tn,
            "fp": fp,
            "fn": fn,
            "tp": tp,
        })

        print(f"Classifier: {clf_name}")
        print(f"Mean CV F1-Score: {np.mean(cv_scores):.4f}")
        print(f"Test F1-Score: {f1:.4f}")
        print(f"Confusion Matrix: TN={tn}, FP={fp}, FN={fn}, TP={tp}")
Classifier: LogisticRegression
Mean CV F1-Score: 0.8494
Test F1-Score: 0.8495
Confusion Matrix: TN=244, FP=53, FN=37, TP=254
๐Ÿƒ View run LogisticRegression_Feature_Engineering at: https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow/#/experiments/5/runs/d4bedd8af22545928910c74e3ec38d50
๐Ÿงช View experiment at: https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow/#/experiments/5
Classifier: RandomForestClassifier
Mean CV F1-Score: 0.9439
Test F1-Score: 0.9561
Confusion Matrix: TN=279, FP=18, FN=8, TP=283
๐Ÿƒ View run RandomForestClassifier_Feature_Engineering at: https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow/#/experiments/5/runs/a89791d41276453f864ea79a3e71bfde
๐Ÿงช View experiment at: https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow/#/experiments/5
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:01:46] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:01:46] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:01:47] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:01:47] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:01:47] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:01:47] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:01:47] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:01:48] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:01:48] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:01:48] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:01:48] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
Classifier: XGBClassifier
Mean CV F1-Score: 0.9459
Test F1-Score: 0.9532
Confusion Matrix: TN=275, FP=22, FN=6, TP=285
๐Ÿƒ View run XGBClassifier_Feature_Engineering at: https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow/#/experiments/5/runs/58153cdd01894d439f8f57906a6e2d97
๐Ÿงช View experiment at: https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow/#/experiments/5
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score, confusion_matrix, make_scorer
import mlflow
import mlflow.sklearn
from dagshub import DAGsHubLogger

# NOTE(review): this cell is a byte-identical re-run of the previous one —
# consider deleting one copy.
# SECURITY NOTE(review): a live DagsHub token is hardcoded here; rotate it and
# load credentials from the environment instead of committing them.
os.environ["MLFLOW_TRACKING_USERNAME"] = "sahilsubhasbhaivachhani"
os.environ["MLFLOW_TRACKING_PASSWORD"] = "379f3e12fb8b9aec32dc4a2b385747be484b3a27"

# Configure MLflow to log to DagsHub
mlflow.set_tracking_uri("https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow")

# Load the dataset (column names in this CSV contain stray spaces — keep verbatim).
df = pd.read_csv("Life Expectancy Data.csv")

# Feature Engineering: ratio/interaction features; epsilon avoids division by zero.
df['Alcohol_GDP_Ratio'] = df['Alcohol'] / (df['GDP'] + 1e-6)
df['Mortality_Ratio'] = df['Adult Mortality'] / (df['infant deaths'] + 1e-6)
df['Expenditure_Per_Capita'] = df['percentage expenditure'] / (df['Population'] + 1e-6)
df['BMI_Schooling_Interaction'] = df[' BMI '] * df['Schooling']

# Binary target: 1 = life expectancy above the median.
X = df.drop(columns=["Life expectancy ", "Country"])
y = (df["Life expectancy "] > df["Life expectancy "].median()).astype(int)

# Stratified train/test split preserves class balance.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Numerical columns: median-impute -> log(1+x) -> standardize.
numerical_columns = X_train.select_dtypes(include=["float64", "int64"]).columns.tolist()
numerical_preprocessor = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("log_transform", FunctionTransformer(np.log1p)),  # log(1 + x)
    ("scaler", StandardScaler())
])

# Categorical columns: mode-impute -> one-hot encode.
categorical_columns = X_train.select_dtypes(include=["object"]).columns.tolist()
categorical_preprocessor = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Combine both pipelines into a single column transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_preprocessor, numerical_columns),
        ("cat", categorical_preprocessor, categorical_columns)
    ]
)

# Classifiers under evaluation. `use_label_encoder` was removed from XGBoost
# and only produced "are not used" warnings — dropped.
classifiers = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForestClassifier": RandomForestClassifier(n_estimators=100),
    "XGBClassifier": XGBClassifier(eval_metric='logloss')
}

# One DagsHub logger shared by every run.
logger = DAGsHubLogger()

# Close any run left open by a previous notebook cell.
if mlflow.active_run():
    mlflow.end_run()

# One MLflow run per classifier; the `with` block ends the run automatically,
# even when an exception is raised, so no manual end_run() is required.
for clf_name, clf in classifiers.items():
    with mlflow.start_run(run_name=f"{clf_name}_Feature_Engineering"):
        pipeline = Pipeline(steps=[
            ("preprocessor", preprocessor),
            ("scaler", MinMaxScaler()),  # global rescale after preprocessing
            ("classifier", clf)
        ])

        # 10-fold stratified CV on the training split, scored by F1.
        cv = StratifiedKFold(n_splits=10)
        f1_scorer = make_scorer(f1_score)
        cv_scores = cross_val_score(pipeline, X_train, y_train, scoring=f1_scorer, cv=cv)

        # Refit on all training data for the held-out evaluation.
        pipeline.fit(X_train, y_train)

        y_pred = pipeline.predict(X_test)
        f1 = f1_score(y_test, y_pred)
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

        # Mirror all metrics to both MLflow and the DagsHub logger.
        mlflow.log_param("classifier", clf_name)
        mlflow.log_metric("mean_cv_f1_score", np.mean(cv_scores))
        mlflow.log_metric("std_cv_f1_score", np.std(cv_scores))
        mlflow.log_metric("test_f1_score", f1)
        mlflow.log_metric("tn", tn)
        mlflow.log_metric("fp", fp)
        mlflow.log_metric("fn", fn)
        mlflow.log_metric("tp", tp)

        logger.log_hyperparams({"classifier": clf_name})
        logger.log_metrics({
            "mean_cv_f1_score": np.mean(cv_scores),
            "std_cv_f1_score": np.std(cv_scores),
            "test_f1_score": f1,
            "tn": tn,
            "fp": fp,
            "fn": fn,
            "tp": tp,
        })

        print(f"Classifier: {clf_name}")
        print(f"Mean CV F1-Score: {np.mean(cv_scores):.4f}")
        print(f"Test F1-Score: {f1:.4f}")
        print(f"Confusion Matrix: TN={tn}, FP={fp}, FN={fn}, TP={tp}")
Classifier: LogisticRegression
Mean CV F1-Score: 0.8494
Test F1-Score: 0.8495
Confusion Matrix: TN=244, FP=53, FN=37, TP=254
๐Ÿƒ View run LogisticRegression_Feature_Engineering at: https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow/#/experiments/5/runs/8c45e04f15954d2e9c1e5f0a718ef015
๐Ÿงช View experiment at: https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow/#/experiments/5
Classifier: RandomForestClassifier
Mean CV F1-Score: 0.9417
Test F1-Score: 0.9459
Confusion Matrix: TN=276, FP=21, FN=11, TP=280
๐Ÿƒ View run RandomForestClassifier_Feature_Engineering at: https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow/#/experiments/5/runs/3653e9d6e51f44d0af39588c11ce568a
๐Ÿงช View experiment at: https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow/#/experiments/5
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:02:19] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:02:19] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:02:20] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:02:20] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:02:20] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:02:20] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:02:20] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:02:20] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:02:21] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:02:21] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:02:21] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
Classifier: XGBClassifier
Mean CV F1-Score: 0.9459
Test F1-Score: 0.9532
Confusion Matrix: TN=275, FP=22, FN=6, TP=285
๐Ÿƒ View run XGBClassifier_Feature_Engineering at: https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow/#/experiments/5/runs/8d815d99dbd8480fa9200e9e93b038d6
๐Ÿงช View experiment at: https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow/#/experiments/5
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import mlflow
from dagshub import DAGsHubLogger

# SECURITY NOTE(review): a live DagsHub token is hardcoded here; rotate it and
# load credentials from the environment instead of committing them.
os.environ["MLFLOW_TRACKING_USERNAME"] = "sahilsubhasbhaivachhani"
os.environ["MLFLOW_TRACKING_PASSWORD"] = "379f3e12fb8b9aec32dc4a2b385747be484b3a27"

# Configure MLflow to log to DagsHub
mlflow.set_tracking_uri("https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow")

# Load the dataset (column names contain stray spaces — keep verbatim).
df = pd.read_csv("Life Expectancy Data.csv")

# Binary target: 1 = life expectancy above the median.
X = df.drop(columns=["Life expectancy ", "Country"])  # Drop target and identifier column
y = (df["Life expectancy "] > df["Life expectancy "].median()).astype(int)

# Stratified split so PCA is fitted on training data only (no test leakage).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Numerical columns: median-impute -> standardize (PCA requires scaled input).
numerical_columns = X_train.select_dtypes(include=["float64", "int64"]).columns.tolist()
numerical_preprocessor = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical columns: mode-impute -> one-hot encode.
categorical_columns = X_train.select_dtypes(include=["object"]).columns.tolist()
categorical_preprocessor = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Combine both pipelines into a single column transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_preprocessor, numerical_columns),
        ("cat", categorical_preprocessor, categorical_columns)
    ]
)

# Fit preprocessing on train only; apply the fitted transform to test.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Full PCA (all components) to inspect the explained-variance spectrum.
pca = PCA()
X_train_pca = pca.fit_transform(X_train_preprocessed)

# Scree plot of the cumulative explained variance ratio.
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance = explained_variance_ratio.cumsum()
plt.figure(figsize=(8, 5))
plt.plot(range(1, len(explained_variance_ratio) + 1), cumulative_variance, marker='o', linestyle='--')
plt.title('Scree Plot')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance')
plt.grid()
plt.savefig("scree_plot.png")
# plt.show() warned and did nothing on the non-interactive Agg backend;
# close the figure instead to release its memory.
plt.close()

# Smallest number of components explaining >= 95% of the variance.
# Guard against argmax's silent 0 when the threshold is never reached
# (np.argmax of an all-False mask returns 0, which would wrongly select
# a single component) — fall back to keeping all components.
if cumulative_variance[-1] >= 0.95:
    n_components_optimal = int(np.argmax(cumulative_variance >= 0.95)) + 1
else:
    n_components_optimal = len(cumulative_variance)
pca_optimal = PCA(n_components=n_components_optimal)
X_train_reduced = pca_optimal.fit_transform(X_train_preprocessed)
X_test_reduced = pca_optimal.transform(X_test_preprocessed)

# Log results in MLflow/DagsHub
mlflow.set_experiment("PCA Experiment")
logger = DAGsHubLogger()

with mlflow.start_run(run_name="PCA_Dimensionality_Reduction"):
    # Log chosen dimensionality and attach the scree plot as an artifact.
    mlflow.log_param("n_components", n_components_optimal)
    mlflow.log_artifact("scree_plot.png")

    logger.log_hyperparams({"n_components": n_components_optimal})

    print(f"Optimal number of components: {n_components_optimal}")
C:\Users\sahil\AppData\Local\Temp\ipykernel_11976\945503783.py:70: UserWarning: FigureCanvasAgg is non-interactive, and thus cannot be shown
  plt.show()
Optimal number of components: 14
๐Ÿƒ View run PCA_Dimensionality_Reduction at: https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow/#/experiments/5/runs/2766daac505146f79f6e926231eafb85
๐Ÿงช View experiment at: https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow/#/experiments/5
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, confusion_matrix, make_scorer
import mlflow
from dagshub import DAGsHubLogger

# SECURITY NOTE(review): a live DagsHub token is hardcoded here; rotate it and
# load credentials from the environment instead of committing them.
os.environ["MLFLOW_TRACKING_USERNAME"] = "sahilsubhasbhaivachhani"
os.environ["MLFLOW_TRACKING_PASSWORD"] = "379f3e12fb8b9aec32dc4a2b385747be484b3a27"

# Configure MLflow to log to DagsHub
mlflow.set_tracking_uri("https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow")

# Load the dataset (column names contain stray spaces — keep verbatim).
df = pd.read_csv("Life Expectancy Data.csv")

# Binary target: 1 = life expectancy above the median.
X = df.drop(columns=["Life expectancy ", "Country"])
y = (df["Life expectancy "] > df["Life expectancy "].median()).astype(int)

# Stratified train/test split preserves class balance.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Numerical columns: median-impute -> log(1+x) -> standardize.
numerical_columns = X_train.select_dtypes(include=["float64", "int64"]).columns.tolist()
numerical_preprocessor = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("log_transform", FunctionTransformer(np.log1p, validate=False)),  # log(1 + x)
    ("scaler", StandardScaler())
])

# Categorical columns: mode-impute -> one-hot encode.
categorical_columns = X_train.select_dtypes(include=["object"]).columns.tolist()
categorical_preprocessor = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Combine both pipelines into a single column transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_preprocessor, numerical_columns),
        ("cat", categorical_preprocessor, categorical_columns)
    ]
)

# Feature selection methods to evaluate.
feature_selection_methods = {
    "VarianceThreshold": VarianceThreshold(threshold=0.01),
    "SelectKBest": SelectKBest(score_func=f_classif, k=10)
}

# Classifiers under evaluation. `use_label_encoder` was removed from XGBoost
# and only produced "are not used" warnings — dropped.
classifiers = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForestClassifier": RandomForestClassifier(n_estimators=100),
    "XGBClassifier": XGBClassifier(eval_metric='logloss')
}

# One DagsHub logger shared by every run.
logger = DAGsHubLogger()

# Evaluate each (selection method, classifier) pair.
for fs_name, fs_method in feature_selection_methods.items():
    # Preprocessing and feature selection depend only on the selection method,
    # not on the classifier — fit them ONCE here instead of refitting inside
    # the classifier loop (the original redid this identical work per classifier).
    preprocessed_X_train = preprocessor.fit_transform(X_train)
    preprocessed_X_test = preprocessor.transform(X_test)

    fs_method.fit(preprocessed_X_train, y_train)
    selected_features_mask = fs_method.get_support()  # Boolean mask of selected features

    # Ensure at least one feature survived the selection.
    if not any(selected_features_mask):
        raise ValueError(f"No features were selected by {fs_name}. Check your feature selection method.")

    # Recover human-readable names: numerical columns first, then the one-hot
    # expanded categorical columns (matches the ColumnTransformer order).
    feature_names = numerical_columns + list(preprocessor.named_transformers_["cat"]["onehot"].get_feature_names_out(categorical_columns))
    selected_features = np.array(feature_names)[selected_features_mask]

    # Project train and test onto the selected feature subset.
    X_train_selected = pd.DataFrame(fs_method.transform(preprocessed_X_train), columns=selected_features)
    X_test_selected = pd.DataFrame(fs_method.transform(preprocessed_X_test), columns=selected_features)

    for clf_name, clf in classifiers.items():
        with mlflow.start_run(run_name=f"{fs_name}_{clf_name}"):
            # 10-fold stratified CV on the selected training features.
            pipeline = Pipeline(steps=[("classifier", clf)])
            cv = StratifiedKFold(n_splits=10)
            f1_scorer = make_scorer(f1_score)
            cv_scores = cross_val_score(pipeline, X_train_selected, y_train, scoring=f1_scorer, cv=cv)

            # Refit on all training data for the held-out evaluation.
            pipeline.fit(X_train_selected, y_train)
            y_pred = pipeline.predict(X_test_selected)

            f1 = f1_score(y_test, y_pred)
            tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

            # Mirror all metrics to both MLflow and the DagsHub logger.
            mlflow.log_param("feature_selection_method", fs_name)
            mlflow.log_param("classifier", clf_name)
            mlflow.log_metric("mean_cv_f1_score", np.mean(cv_scores))
            mlflow.log_metric("std_cv_f1_score", np.std(cv_scores))
            mlflow.log_metric("test_f1_score", f1)
            mlflow.log_metric("tn", tn)
            mlflow.log_metric("fp", fp)
            mlflow.log_metric("fn", fn)
            mlflow.log_metric("tp", tp)

            logger.log_hyperparams({"feature_selection_method": fs_name, "classifier": clf_name})
            logger.log_metrics({
                "mean_cv_f1_score": np.mean(cv_scores),
                "std_cv_f1_score": np.std(cv_scores),
                "test_f1_score": f1,
                "tn": tn,
                "fp": fp,
                "fn": fn,
                "tp": tp,
            })
๐Ÿƒ View run VarianceThreshold_LogisticRegression at: https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow/#/experiments/5/runs/13ec974412cd4f61b7d24a0a42105207
๐Ÿงช View experiment at: https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow/#/experiments/5
๐Ÿƒ View run VarianceThreshold_RandomForestClassifier at: https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow/#/experiments/5/runs/8eafe452161c4b12a7903676b42dc086
๐Ÿงช View experiment at: https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow/#/experiments/5
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:03:00] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:03:00] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:03:01] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:03:01] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:03:01] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:03:01] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:03:01] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:03:01] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:03:01] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:03:01] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:03:02] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
๐Ÿƒ View run VarianceThreshold_XGBClassifier at: https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow/#/experiments/5/runs/1fad6a365dfe4549b72c8ed0b531ad84
๐Ÿงช View experiment at: https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow/#/experiments/5
๐Ÿƒ View run SelectKBest_LogisticRegression at: https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow/#/experiments/5/runs/77ae92ef31a44b52b15869cfceda8fda
๐Ÿงช View experiment at: https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow/#/experiments/5
๐Ÿƒ View run SelectKBest_RandomForestClassifier at: https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow/#/experiments/5/runs/dc503ade9f8a4a91a6b8c812e2c33ca8
๐Ÿงช View experiment at: https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow/#/experiments/5
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:03:36] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:03:36] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:03:36] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:03:36] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:03:37] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:03:37] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:03:37] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:03:37] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:03:37] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:03:37] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
C:\Users\sahil\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [05:03:37] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
๐Ÿƒ View run SelectKBest_XGBClassifier at: https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow/#/experiments/5/runs/ca457e338d8b42a3a41419cbc588c694
๐Ÿงช View experiment at: https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow/#/experiments/5
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import f1_score, confusion_matrix, make_scorer
import mlflow
from dagshub import DAGsHubLogger

# Set DagsHub credentials.
# SECURITY NOTE(review): a live access token is hard-coded here and committed to
# source control — it should be rotated and read from the environment / a secrets
# store instead of being assigned in code.
os.environ["MLFLOW_TRACKING_USERNAME"] = "sahilsubhasbhaivachhani"
os.environ["MLFLOW_TRACKING_PASSWORD"] = "379f3e12fb8b9aec32dc4a2b385747be484b3a27"

# Configure MLflow to log to the DagsHub-hosted tracking server.
mlflow.set_tracking_uri("https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow")

# Load the dataset.
df = pd.read_csv("Life Expectancy Data.csv")

# Define features and target variable.
# Target: binary label — 1 if life expectancy is above the dataset median.
X = df.drop(columns=["Life expectancy ", "Country"])  # Drop target and unrelated columns
y = (df["Life expectancy "] > df["Life expectancy "].median()).astype(int)

# Split data into stratified training and testing sets (80/20, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Preprocessing pipeline for numerical columns: impute -> log(1 + x) -> standardize.
numerical_columns = X_train.select_dtypes(include=["float64", "int64"]).columns.tolist()
numerical_preprocessor = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("log_transform", FunctionTransformer(np.log1p, validate=False)),  # Log transformation: log(1 + x)
    ("scaler", StandardScaler())
])

# Preprocessing pipeline for categorical columns: impute mode -> one-hot encode.
categorical_columns = X_train.select_dtypes(include=["object"]).columns.tolist()
categorical_preprocessor = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Combine preprocessors into a column transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_preprocessor, numerical_columns),
        ("cat", categorical_preprocessor, categorical_columns)
    ]
)

# Fit the preprocessing pipeline ONCE; it does not depend on the classifier, so
# re-running fit_transform/transform inside the loop below was redundant work.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Define ensemble classifiers for evaluation.
classifiers = {
    "RandomForest": RandomForestClassifier(n_estimators=100),
    "GradientBoosting": GradientBoostingClassifier(n_estimators=100),
    "VotingClassifier": VotingClassifier(
        estimators=[
            ('rf', RandomForestClassifier(n_estimators=100)),
            ('gb', GradientBoostingClassifier(n_estimators=100))
        ],
        voting='soft'  # soft voting averages predicted probabilities
    )
}

# Initialize DagsHub logger outside of a context manager (closed after the loop).
logger = DAGsHubLogger()

# Evaluate each classifier in its own MLflow run.
for clf_name, clf in classifiers.items():
    with mlflow.start_run(run_name=f"{clf_name}_Experiment"):
        # 10-fold stratified cross-validation on the training set, scored by F1.
        pipeline = Pipeline(steps=[("classifier", clf)])
        cv = StratifiedKFold(n_splits=10)
        f1_scorer = make_scorer(f1_score)
        cv_scores = cross_val_score(pipeline, X_train_preprocessed, y_train, scoring=f1_scorer, cv=cv)

        # Fit on the full training set and evaluate once on the held-out test set.
        pipeline.fit(X_train_preprocessed, y_train)
        y_pred = pipeline.predict(X_test_preprocessed)

        # Test-set F1 plus confusion-matrix cell counts.
        f1 = f1_score(y_test, y_pred)
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

        # Log hyperparameters and metrics to MLflow/DagsHub.
        mlflow.log_param("classifier", clf_name)
        mlflow.log_metric("mean_cv_f1_score", np.mean(cv_scores))
        mlflow.log_metric("std_cv_f1_score", np.std(cv_scores))
        mlflow.log_metric("test_f1_score", f1)
        mlflow.log_metric("tn", tn)
        mlflow.log_metric("fp", fp)
        mlflow.log_metric("fn", fn)
        mlflow.log_metric("tp", tp)

        # Mirror the same values through the DagsHub logger.
        logger.log_hyperparams({"classifier": clf_name})
        logger.log_metrics({
            "mean_cv_f1_score": np.mean(cv_scores),
            "std_cv_f1_score": np.std(cv_scores),
            "test_f1_score": f1,
            "tn": tn,
            "fp": fp,
            "fn": fn,
            "tp": tp,
        })

# Release the logger's file handles now that all runs are recorded.
logger.close()
๐Ÿƒ View run RandomForest_Experiment at: https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow/#/experiments/5/runs/5349dcd6089e476582ac05d1843f158b
๐Ÿงช View experiment at: https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow/#/experiments/5
๐Ÿƒ View run GradientBoosting_Experiment at: https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow/#/experiments/5/runs/f84f8c75b2ec472d8ca2c1bbfa7bae53
๐Ÿงช View experiment at: https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow/#/experiments/5
๐Ÿƒ View run VotingClassifier_Experiment at: https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow/#/experiments/5/runs/2a843008f0d4409abbd107d00f1f2893
๐Ÿงช View experiment at: https://dagshub.com/sahilsubhasbhaivachhani/my-first-repo.mlflow/#/experiments/5
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow

# Define experiment names and their corresponding F1-scores (replace with actual values from MLflow).
experiment_data = {
    "Experiment": ["Experiment 1", "Experiment 2", "Experiment 3", "Experiment 4", 
                   "Experiment 5", "Experiment 6", "Experiment 7"],
    "Mean CV F1-Score": [0.82, 0.85, 0.88, 0.83, 0.81, 0.86, 0.89],  # Replace with actual values
    "Test F1-Score": [0.80, 0.84, 0.87, 0.82, 0.79, 0.85, 0.88],      # Replace with actual values
    "CV Std Dev": [0.02, 0.03, 0.01, 0.02, 0.03, 0.02, 0.01]          # Replace with actual values
}

# Convert to DataFrame.
df = pd.DataFrame(experiment_data)

# Plot the two scores as overlaid bars. Draw the Mean CV bars FIRST: every mean
# CV value here exceeds its test value, so in the previous order the CV bars
# were painted on top of the test bars and hid them completely. With the taller
# bars in the back, the shorter test bars stay visible in front.
plt.figure(figsize=(12, 6))
sns.barplot(x="Experiment", y="Mean CV F1-Score", data=df, color="lightgreen", label="Mean CV F1-Score")
sns.barplot(x="Experiment", y="Test F1-Score", data=df, color="skyblue", label="Test F1-Score")

# Add error bars for the cross-validation standard deviation.
plt.errorbar(df["Experiment"], df["Mean CV F1-Score"], yerr=df["CV Std Dev"], fmt='o', color='green', label="CV Std Dev")

# Annotate the experiment with the best test F1 (positional index doubles as the
# categorical x coordinate because the bars follow DataFrame row order).
best_experiment = df.loc[df["Test F1-Score"].idxmax()]
plt.text(best_experiment.name, best_experiment["Test F1-Score"] + 0.01,
         f"Best: {best_experiment['Experiment']}\nF1={best_experiment['Test F1-Score']:.2f}",
         ha='center', color='red', fontsize=10)

# Customize plot.
plt.title("F1-Score Comparison Across Experiments")
plt.ylabel("F1-Score")
plt.xlabel("Experiments")
plt.legend(loc="lower right")
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()

# Save and log plot in MLflow/DagsHub.
# NOTE(review): there is no explicit active MLflow run here — log_artifact will
# attach to (or auto-create) a run; confirm this lands where intended.
plt.savefig("f1_score_comparison.png")
mlflow.log_artifact("f1_score_comparison.png")

# Show plot (no-op warning under non-interactive Agg backend).
plt.show()
C:\Users\sahil\AppData\Local\Temp\ipykernel_11976\2453596086.py:45: UserWarning: FigureCanvasAgg is non-interactive, and thus cannot be shown
  plt.show()
import joblib
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Example dataset: the bundled iris data (150 samples, 4 features, 3 classes).
data = load_iris()
X, y = data.data, data.target

# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a logistic-regression classifier (extra iterations so the solver converges).
final_model = LogisticRegression(max_iter=200).fit(X_train, y_train)

# Persist the fitted estimator to disk via joblib.
joblib.dump(final_model, "final_model.joblib")
print("Model saved successfully as 'final_model.joblib'")
Model saved successfully as 'final_model.joblib'